% Using R and Longitudinal Data System Records to Answer Policy Questions
% Jared Knowles

Overview

Why R?

R

Google Scholar Hits

R has recently passed Stata on Google Scholar hits, and it is catching up to the two major players, SPSS and SAS.

R Has an Active Web Presence

R is linked to from more and more sites

R Extensions

These links come from the explosion of add-on packages to R

R Has an Active Community

Usage of the R listserv for help has really exploded recently

R Examples

Read in Data

# Load the simulated student-level demo data and inspect a slice of rows.
studat <- read.csv(file.path("data", "smalldata.csv"))
str(studat[5:18, ])
## 'data.frame':    14 obs. of  32 variables:
##  $ X          : int  274 276 478 574 613 620 717 772 1004 1056 ...
##  $ school     : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ stuid      : int  142705 14995 120205 103495 55705 28495 37705 52705 41995 10705 ...
##  $ grade      : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ schid      : int  205 495 205 495 205 495 205 205 495 205 ...
##  $ dist       : int  75 105 15 45 75 45 75 75 105 75 ...
##  $ white      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ black      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hisp       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ indian     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ asian      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ econ       : int  1 1 1 1 1 0 1 0 1 1 ...
##  $ female     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ell        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ disab      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sch_fay    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ dist_fay   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ luck       : int  0 0 1 0 0 0 0 1 0 0 ...
##  $ ability    : num  81.9 101.9 87.3 96.6 98.4 ...
##  $ measerr    : num  52.98 22.6 4.67 -9.35 -7.7 ...
##  $ teachq     : num  56.68 71.62 66.88 75.21 4.95 ...
##  $ year       : int  2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
##  $ attday     : int  156 157 169 180 170 152 162 180 152 165 ...
##  $ schoolscore: num  56 56 56 56 56 ...
##  $ district   : int  3 3 3 3 3 3 3 3 3 3 ...
##  $ schoolhigh : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ schoolavg  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ schoollow  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ readSS     : num  373 437 418 454 310 ...
##  $ mathSS     : num  441 463 436 434 284 ...
##  $ proflvl    : Factor w/ 4 levels "advanced","basic",..: 2 4 4 4 3 2 4 4 2 3 ...
##  $ race       : Factor w/ 5 levels "A","B","H","I",..: 2 2 2 2 2 2 2 2 2 2 ...
# Build the larger simulated longitudinal dataset used by later chunks
# (presumably defines student_long / stuatt -- TODO confirm against the script).
source("data/simulate_data.R")

Simple Diagnostics

source("ggplot2themes.R")
library(ggplot2)
# Scatter of reading vs. math scale scores with separate smoothers by ELL
# status. qplot() is deprecated in ggplot2 >= 3.4; build with ggplot() directly.
ggplot(studat, aes(x = readSS, y = mathSS)) +
  geom_point(alpha = 0.2) +
  geom_smooth(aes(group = ell, color = factor(ell))) +
  theme_dpi()

plot of chunk unnamed-chunk-1

Advanced Diagnostics

# NOTE(review): sample() is unseeded, so a different set of 24 students is
# drawn on every run -- add set.seed() upstream if reproducibility matters.
samp <- sample(studat$stuid, 24)
plotsub <- subset(studat, stuid %in% samp)
# Per-student reading trajectories across grades, one facet per student.
# qplot() is deprecated in ggplot2 >= 3.4; build with ggplot() directly.
ggplot(plotsub, aes(x = grade, y = readSS)) +
  geom_point() +
  facet_wrap(~stuid, nrow = 4, ncol = 6) +
  theme_dpi() +
  geom_line() +
  geom_smooth(method = "lm", se = FALSE)

plot of chunk unnamed-chunk-2

More Advanced

What do we do?

Evaluations of Policy

Results

Conclusions and Next Steps

Conclusions

Next Steps

Inference Trees

Outline of the conditional inference tree structure.

library(partykit)
# Bonferroni-adjusted split tests with a stricter criterion (1 - p >= 0.99).
mypar <- ctree_control(testtype = "Bonferroni", mincriterion = 0.99)
# BUG FIX: mypar was constructed but never passed to ctree(), so the tree
# silently used the default control settings; pass it via `control =`.
mytree <- ctree(mathSS ~ race + econ + ell + disab + sch_fay + dist_fay + 
    attday + readSS, data = subset(studat, grade == 3), control = mypar)
plot(mytree)

plot of chunk parttree

Can Standardize and Share / Compare Results

Code collaboration

Some code sharing exists

Can do more

##   sid school_year race_ethnicity
## 1   1        2004              B
## 2   1        2005              H
## 3   1        2006              H
## 4   1        2007              H
## 5   2        2006              W
## 6   2        2007              B

What business rules do we use?

What to do?

# Peek at the first four attribute records (all for sid 1) to show the
# inconsistent race_ethnicity coding across years.
head(stuatt, 4)
##   sid school_year race_ethnicity
## 1   1        2004              B
## 2   1        2005              H
## 3   1        2006              H
## 4   1        2007              H

Fix the data

# Copy the raw race/ethnicity field into a working column, then overwrite the
# first record so this student's coding is consistent across years.
stuatt$race2 <- stuatt$race_ethnicity
stuatt$race2[1] <- "H"
head(stuatt, 4)
##   sid school_year race_ethnicity race2
## 1   1        2004              B     H
## 2   1        2005              H     H
## 3   1        2006              H     H
## 4   1        2007              H     H

Do analytics on fixed data

# Setup data
# Coerce year and proficiency level to numeric for modeling.
student_long$year <- as.numeric(student_long$year)
student_long$proflvl <- as.numeric(student_long$proflvl)
# Set aside test set
# NOTE(review): sample() is unseeded, so the train/test split is not
# reproducible across runs -- TODO confirm whether a seed is set upstream.
testset <- sample(unique(student_long$stuid), 190000)
student_long$case <- 0
student_long$case[student_long$stuid %in% testset] <- 1
# Draw a training set of data (random subset of students)
# Split by student id so all of a student's records land on one side.
training <- subset(student_long, case == 0)
testing <- subset(student_long, case == 1)

# NOTE(review): positional column selection is fragile -- these indices
# silently break if simulate_data.R ever reorders columns; selecting by
# name would be safer.
training <- training[, c(3, 6:16, 21, 22, 28, 29, 30)]  # subset vars
trainX <- training[, 1:15]

# Parameters
# 15-fold cross-validation repeated 5 times (requires the caret package,
# presumably loaded earlier -- TODO confirm).
ctrl <- trainControl(method = "repeatedcv", number = 15, repeats = 5, 
    summaryFunction = defaultSummary)
# Search grid
# Tune over tree depth (2-6), number of trees (200-800), and two learning rates.
grid <- expand.grid(.interaction.depth = seq(2, 6, by = 1), .n.trees = seq(200, 
    800, by = 50), .shrinkage = c(0.01, 0.1))
# Boosted tree search
# Fit gradient-boosted trees predicting math scale scores, selecting the
# grid point that minimizes cross-validated RMSE.
gbmTune <- train(x = trainX, y = training$mathSS, method = "gbm", 
    metric = "RMSE", trControl = ctrl, tuneGrid = grid, verbose = FALSE)
# Score the held-out students with the tuned model.
gbmPred <- predict(gbmTune, testing[, names(trainX)])

# svmTune<-train(x=trainX, y=training$mathSS, method='svmLinear',
# tuneLength=3, metric='RMSE', trControl=ctrl)

Why?

Plot

# Predicted vs. observed math scores on the test set, hex-binned to handle
# overplotting. qplot() is deprecated in ggplot2 >= 3.4; use ggplot() directly.
ggplot(mapping = aes(x = testing$mathSS, y = gbmPred)) +
  geom_hex(binwidth = c(10, 10)) +
  geom_smooth() +
  theme_dpi()

plot of chunk modeldiag1

Plot 2

# Residuals (observed minus predicted) against observed scores; curvature in
# the smoother indicates systematic bias. qplot() is deprecated in
# ggplot2 >= 3.4; use ggplot() directly.
ggplot(mapping = aes(x = testing$mathSS, y = testing$mathSS - gbmPred)) +
  geom_hex(binwidth = c(10, 10)) +
  geom_smooth() +
  theme_dpi()

plot of chunk modeldiag2

Plot 3

# Histogram of test-set residuals, truncated to +/- 60 scale-score points.
# qplot() is deprecated in ggplot2 >= 3.4; use ggplot() directly.
ggplot(mapping = aes(x = testing$mathSS - gbmPred)) +
  geom_histogram(binwidth = 7) +
  theme_dpi() +
  xlim(c(-60, 60))

plot of chunk modeldiag3

Plot 4

# Plot cross-validated RMSE across the gbm tuning grid (plot method for
# caret `train` objects).
plot(gbmTune)

plot of chunk modeldiag4

Errata